# Assign locations using collected data
pacman::p_load(tidyverse, data.table, sf, cowplot, viridis, qs)


## Preliminaries
rm(list = ls())

source("code/globals.R")

#-------------------------------------------------------#
#---- Bring in ESRI geocodes -------------------------#
#-------------------------------------------------------#
# READ IN GEOCODED DATA AFTER ARCGIS PROCESSING

# Previously: temp_2022_06_17.RDS
location_info <- readRDS(file.path(INPUT_PRIVATE, "ESRI", "main_sample_geocoding.RDS")) %>% 
  st_set_geometry(NULL) %>%
  dplyr::select(ImportParcelID = USER_ImportParcelID, BuildingOrImprovementNumber = USER_BuildingOrImprovementNumber, 
                lat_ESRI = DisplayY, 
                lon_ESRI = DisplayX, 
                Side, Status, Score) %>%
  as.data.table()


#-------------------------------------------------------------------------#
#---- Attach ESRI geocodes to main homes dataset -------------------------#
#-------------------------------------------------------------------------#

zillowdata <- readRDS(file.path(WORKING, "FullRegDataDuplicatedControls.RDS"))
zillowdata <- zillowdata[, .(
  ImportParcelID, BuildingOrImprovementNumber,
  destroyed, YearBuilt, FIPS, incidentid, single_family,
  street, street100, street25, State,
  PropertyStreetPreDirectional,
  PropertyStreetName,
  PropertyStreetSuffix,
  PropertyStreetPostDirectional,
  PropertyZip,
  PropertyHouseNumber
)]
setnames(zillowdata, "incidentid", "UNIT_AND_NUMBER")
a1 <- fsetdiff(
  zillowdata[, .(ImportParcelID, BuildingOrImprovementNumber)],
  location_info[, .(ImportParcelID, BuildingOrImprovementNumber)]
)

b1 <- fsetdiff(
  location_info[, .(ImportParcelID, BuildingOrImprovementNumber)],
  zillowdata[, .(ImportParcelID, BuildingOrImprovementNumber)]
)
stopifnot(nrow(a1) == 0) ## I expect b1  >0 because I apply additional sample exclusions after exporting geocode dataset (I intentionally do this far upstream to get locations for as many homes as possible in case make changes later)

rm(a1, b1)
zillowdata <- merge(zillowdata, location_info, by = c("ImportParcelID", "BuildingOrImprovementNumber"))
zillowdata$destroyed <- as.factor(zillowdata$destroyed)
zillowdata <- data.table(zillowdata)
colSums(is.na(zillowdata))

# Note: few homes that don't match have very bad quality addresses. 
randomrows <- sample(1:nrow(zillowdata[Status == "U", ]), 500, replace = FALSE)
randomrows <- zillowdata[Status == "U", .(PropertyStreetName, PropertyStreetSuffix, PropertyHouseNumber, PropertyZip)][randomrows]
rm(randomrows)

#-------------------------------------------------------------------------#
#---- Attach rooftop footprint geocodes to main dataset -------------------------#
#-------------------------------------------------------------------------#

roofpoints <- qread(file.path(WORKING, "rooftop_locations.qs"))

a1 <- setdiff(
  zillowdata$ImportParcelID,
  roofpoints$ImportParcelID
)
b1 <- setdiff(
  roofpoints$ImportParcelID,
  zillowdata$ImportParcelID
)

## Until/unless I eventually get parcel shapefiles for all counties, including outside CA, there will always be many counties in zillowdata and not in roofpoints.
# I also expect b1 >0 because I apply additional sample exclusions after exporting geocode dataset (I intentionally do this far upstream to get locations for as many homes as possible in case make changes later)

rm(a1, b1)

stopifnot(sum(duplicated(roofpoints$ImportParcelID)) == 0)

zillowdata <- merge(zillowdata, subset(roofpoints, select = -FIPS), by = "ImportParcelID", all.x = TRUE) %>%
  mutate(
    geometry = NULL,
    roofpolygon = NULL
  ) %>%
  data.table()

colSums(is.na(zillowdata))

### Apply rules to choose location source for each parcel
# Primary rule: use building footprint when it exists, unless greater than 3 buildings on parcel (new subdivision problem in LA).

zillowdata <- zillowdata %>%
  mutate(loc_source = case_when(
    !is.na(lon_roof) & !is.na(lat_roof) & numfootprints <= 3 & !is.na(numfootprints) ~ "BldgFootprints",
    is.na(lon_roof) | is.na(lat_roof) | numfootprints > 3 | is.na(numfootprints) ~ "ESRI",
    TRUE ~ ""
  ))

### -----Exceptions --------------------------------------------###

## Use ESRI for all incidents where merge rate between homes and lot line shapefiles is less than 95%.
poormerges <- read.csv(file.path(WORKING, "mergerates_homes_to_lotlines.csv")) %>%
  dplyr::filter(mergerate < 0.95) %>%
  rename(UNIT_AND_NUMBER = incidentid)

zillowdata$loc_source[zillowdata$UNIT_AND_NUMBER %in% poormerges$UNIT_AND_NUMBER] <- "ESRI"

#* Any counties where GIS data are inaccurate, incorrectly projected in source data (and unfixable), etc.
#* This used to be Madera, Riverside, Orange, and maybe LA; in updated analysis have tried to fix - verify these counties in checking procedure.
## zillowdata$loc_source[as.numeric(zillowdata$FIPS) %in% c(6039,6065,6059)]<-"ESRI"

## County parcel boundaries for Lassen County, CA in the state omnibus file are very screwed up and unusable. For now I override with ESRI locations; eventually hope to get Lassen-specific parcel data.
zillowdata$loc_source[zillowdata$FIPS == 6035] <- "ESRI"

### Minor exception: For 7 California homes, use ESRI lat/long because the footprint location puts them in the ocean.
#* There are 7 coastal properties where the footprint locations get put in the ocean, outside the SRA/LRA layer and
#* thus return NA for the SRA/LRA/FRA merge. I investigated these by mapping and producing
#* a CSV to view in ArcGIS. WHat's happening here is that these are parcels where the MS building footprints layer does not
#* include any buildings for some reason, and so we use the location of the parcel centroid, which is a bit in the ocean because
#* of the curving nature of the coastline for these 5 properties. I solve this by using the ESRI locations for these 5 properties.
zillowdata$loc_source[zillowdata$ImportParcelID %in%
  c(12114183, 12114198, 12114199, 12114609, 21200182, 21426359, 21426379)] <- "ESRI"

## Create final lat/long variables based on these rules
# stopifnot(anyNA(zillowdata$loc_source)==FALSE)
table(zillowdata$loc_source, useNA = "always")
stopifnot(sum(!zillowdata$loc_source %in% c("BldgFootprints", "ESRI")) == 0)

zillowdata <- zillowdata %>%
  mutate(lon = case_when(
    loc_source == "BldgFootprints" ~ lon_roof,
    loc_source == "ESRI" ~ lon_ESRI
  )) %>%
  mutate(lat = case_when(
    loc_source == "BldgFootprints" ~ lat_roof,
    loc_source == "ESRI" ~ lat_ESRI
  ))

stopifnot(anyNA(zillowdata$lon) == FALSE & anyNA(zillowdata$lat) == FALSE)

### Backstop location accuracy by looking for points outside the reported COUNTY of the home. These
# are limited to ESRI, since by construction the parcel-based rooftop points are inside counties.
# If the assigned location is outside the allowed buffer around the county, override ESRI quality code with 'U'.
countybounds <- st_read(file.path(INPUT_PUBLIC, "CensusTiger", "tl_2019_us_county", "tl_2019_us_county.shp")) %>%
  dplyr::select(GEOID) %>%
  st_transform(crs = 3310) # This projection has units of meters, so this buffer should be measured in meters
backstopcheck <- zillowdata %>%
  dplyr::select(ImportParcelID, BuildingOrImprovementNumber, UNIT_AND_NUMBER, FIPS, lat, lon, loc_source) %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326, stringsAsFactors = FALSE, remove = TRUE) %>%
  st_transform(crs = st_crs(countybounds)) %>%
  st_join(countybounds, left = TRUE) %>%
  dplyr::filter(FIPS != as.numeric(GEOID)) %>%
  rename(mappedcounty = GEOID)
#-helper function to identify locations outside counties
checkinbuffer <- function(c) {
  o <- backstopcheck %>%
    dplyr::filter(FIPS == c)
  o$inbuffer <- st_is_within_distance(o,
    countybounds[as.numeric(countybounds$GEOID) == c, ],
    dist = 500,
    sparse = FALSE
  )
  o <- o %>%
    dplyr::filter(inbuffer == FALSE) %>% # keep those outside the allowed buffer distance
    st_drop_geometry()
  stopifnot(o$loc_source == "ESRI") # should be true by construction; make sure no weird errors.
  return(o)
}
alist <- lapply(unique(backstopcheck$FIPS), checkinbuffer)
a <- do.call(rbind, alist) %>%
  dplyr::select(ImportParcelID, BuildingOrImprovementNumber, UNIT_AND_NUMBER) %>%
  mutate(badlocation = 1)

# Verify unique merge keys and then merge in the bad location flag
stopifnot(anyDuplicated(a[, c("ImportParcelID", "BuildingOrImprovementNumber", "UNIT_AND_NUMBER")]) == 0)
stopifnot(anyDuplicated(zillowdata[, c("ImportParcelID", "BuildingOrImprovementNumber", "UNIT_AND_NUMBER")]) == 0)
zillowdata <- merge(zillowdata, a,
  by = c("ImportParcelID", "BuildingOrImprovementNumber", "UNIT_AND_NUMBER"),
  all.x = TRUE
) %>%
  mutate(Status = if_else(!is.na(badlocation),
    true = "U",
    false = Status
  )) %>%
  dplyr::select(-badlocation)
rm(countybounds, backstopcheck, alist, a)

###### Report on location quality by incident and then exclude the small number of homes that I cannot successfully geocode.
## Some notes: This drops homes that can be used in analyses that are not reliant on geographic information (e.g. pure time series). May want to revisit.
## I don't enforce any particular limit on fraction of homes that geocode succesfully to keep an incident. I could.
## Understand how ESRI handles non-matches: coordinates get returned as (0,0)
zillowdata$Status[zillowdata$loc_source == "BldgFootprints"] <- "F"
stopifnot(anyNA(zillowdata$Status) == FALSE)
table(zillowdata$Status, useNA = "always")
# By incident, what share of homes have usable locations?
locreport <- zillowdata %>%
  mutate(Status = if_else(Status == "F", "Footprint", Status)) %>%
  mutate(Status = if_else(Status == "M" | Status == "T", "ESRI", Status)) %>%
  mutate(Status = if_else(Status == "U", "Unusable", Status)) %>%
  group_by(FIPS, UNIT_AND_NUMBER, Status) %>%
  summarize(n()) %>%
  tidyr::pivot_wider(names_from = Status, values_from = `n()`, values_fill = 0) %>%
  mutate(tothomes = ESRI + Footprint + Unusable) %>%
  mutate(
    ESRI = round(100 * ESRI / tothomes, 2),
    Footprint = round(100 * Footprint / tothomes, 2),
    Unusable = round(100 * Unusable / tothomes, 2)
  ) %>%
  arrange(Unusable) %>%
  write.csv(file.path(WORKING, "locsource_by_incident.csv"))
rm(locreport)
## Drop homes with unusable locations (Status == 'U')
zillowdata <- zillowdata %>% dplyr::filter(!(loc_source == "ESRI" & Status == "U"))

qsave(zillowdata, file.path(WORKING, "zillowdata.qs"))



#-------------------------------------------------------------------#
#-------------------------------------------------------------------#
#---- OPTIONAL: Compare ESRI vs. Footprint locations ---------------#
#-------------------------------------------------------------------#
#-------------------------------------------------------------------#

# Note the set of locations won't be exactly the same here - I only do footprints for single family home types,
# and there are some homes that don't geocode successfully with one or the other method (or maybe both).

### Summarize differences in location by county, for homes with valid measures for both methods
compsample <- zillowdata[!is.na(lon_roof) & !is.na(lon_ESRI)] %>%
  group_by(FIPS) %>%
  dplyr::slice_sample(prop = 0.01) # scale this up after testing, but remember st_distance is slow.

geocode_sf <- st_as_sf(compsample[, c("ImportParcelID", "BuildingOrImprovementNumber", "lon_ESRI", "lat_ESRI")],
  coords = c("lon_ESRI", "lat_ESRI"),
  crs = 4326,
  stringsAsFactors = FALSE,
  remove = TRUE
)
roofs_sf <- st_as_sf(compsample[, c("ImportParcelID", "BuildingOrImprovementNumber", "lon_roof", "lat_roof")],
  coords = c("lon_roof", "lat_roof"),
  crs = 4326,
  stringsAsFactors = FALSE,
  remove = TRUE
)

compsample$diff <- st_distance(geocode_sf, roofs_sf, by_element = TRUE)
compsample <- data.table(compsample)
compsample[,
  j = list(
    p1 = quantile(diff, probs = 0.01),
    p5 = quantile(diff, probs = 0.05),
    p50 = quantile(diff, probs = 0.50),
    p95 = quantile(diff, probs = 0.95),
    p99 = quantile(diff, probs = 0.99),
    p999 = quantile(diff, probs = 0.999)
  ),
  by = FIPS
]
compsample[, j = list(
  p1 = quantile(diff, probs = 0.01),
  p5 = quantile(diff, probs = 0.05),
  p50 = quantile(diff, probs = 0.50),
  p95 = quantile(diff, probs = 0.95),
  p99 = quantile(diff, probs = 0.99),
  p999 = quantile(diff, probs = 0.999)
)]




#########################################################################
##### READ BUILDING FOOTPRINTS AND SAVE INCIDENT-SPECIFIC SUBSETS #######
#########################################################################

## Creating these small incident-specific files now speeds stuff up later. (I think these are only used in 4c_QC_Geocoding).

zillowdata <- qread(file.path(WORKING, "zillowdata.qs"))

IncidentBldgs <- function(k) {
  print(k)
  homesinthisfire <- zillowdata[zillowdata$UNIT_AND_NUMBER == k, ] %>%
    st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
    st_transform(st_crs(footprints))
  homesbuffered <- homesinthisfire %>%
    st_buffer(1000) %>% # This projection has units of meters, so this buffer should be measured in meters
    st_bbox() %>%
    st_as_sfc() %>%
    st_sf()
  bldgsnearthisfire <- st_join(footprints, homesbuffered, left = FALSE, join = st_intersects) # much faster and less sensitive to invalid geoms than st_crop()
  qsave(bldgsnearthisfire, file.path(WORKING, "buildings", paste0(k, "bldgs.qs")))
  p <- ggplot() +
    geom_sf(data = homesbuffered, color = "green", alpha = 0.5) +
    geom_sf(data = bldgsnearthisfire, color = "blue") +
    geom_sf(data = homesinthisfire, color = "red", alpha = 0.1)
  save_plot(file.path(WORKING, "buildings", paste0(k, "bldgs.png")), plot = p)
}

cores <- 4 # SSRDE cluster
# cores<-1 #Windows PC

## California
s <- "California"
vintage <- "2018release"
footprints <- st_read(file.path(INPUT_PUBLIC, "Microsoft_Buildings", vintage, paste0(s, ".geojson"))) %>% # slow (15 mins for CA)
  st_transform(crs = 3310) ## NAD 83 California Albers
firesinthisstate <- unique(zillowdata$UNIT_AND_NUMBER[zillowdata$State == s])
lapply(firesinthisstate, IncidentBldgs)
rm(footprints)

## Arizona
s <- "Arizona"
vintage <- "2021release"
footprints <- st_read(file.path(INPUT_PUBLIC, "Microsoft_Buildings", vintage, paste0(s, ".geojson"))) %>% # slow (15 mins for CA)
  st_transform(crs = 3310) ## NAD 83 California Albers
print(st_crs(footprints))
firesinthisstate <- unique(zillowdata$UNIT_AND_NUMBER[zillowdata$State == s])
lapply(firesinthisstate, IncidentBldgs)
rm(footprints)

## Colorado
s <- "Colorado"
vintage <- "2021release"
footprints <- st_read(file.path(INPUT_PUBLIC, "Microsoft_Buildings", vintage, paste0(s, ".geojson"))) %>% # slow (15 mins for CA)
  st_transform(crs = 3310) ## NAD 83 California Albers
firesinthisstate <- unique(zillowdata$UNIT_AND_NUMBER[zillowdata$State == s])
lapply(firesinthisstate, IncidentBldgs)
rm(footprints)

## Oregon
s <- "Oregon"
vintage <- "2021release"
footprints <- st_read(file.path(INPUT_PUBLIC, "Microsoft_Buildings", vintage, paste0(s, ".geojson"))) %>% # slow (15 mins for CA)
  st_transform(crs = 3310) ## NAD 83 California Albers
firesinthisstate <- unique(zillowdata$UNIT_AND_NUMBER[zillowdata$State == s])
lapply(firesinthisstate, IncidentBldgs)
rm(footprints)

## Washington
s <- "Washington"
vintage <- "2021release"
footprints <- st_read(file.path(INPUT_PUBLIC, "Microsoft_Buildings", vintage, paste0(s, ".geojson"))) %>% # slow (15 mins for CA)
  st_transform(crs = 3310) ## NAD 83 California Albers
firesinthisstate <- unique(zillowdata$UNIT_AND_NUMBER[zillowdata$State == s])
lapply(firesinthisstate, IncidentBldgs)
rm(footprints)
